공간적 자기상관을 고려한
기계학습 기반 침수 위험 지역 예측

- Spatial Random Forest Modeling with Eigenvector Spatial Filtering

Author

인문대학 고고미술사학과
2019-13439
정송희

Published

June 26, 2023

library(tidyverse)
library(rmarkdown)
library(corrplot)
library(sf)
library(terra)
library(tmap)
library(whitebox)
library(spatialEco)
library(car)
library(tidymodels)
library(rpart.plot)
library(vip)
library(randomForest)
library(adespatial)
library(ade4)
library(adegraphics)
library(spdep)
library(maptools)
library(spatialRF)
library(randomForestExplainer)
library(doParallel)

1. Data

Variables

대분류 중분류 소분류
Feature (예측변수) Terrain (지형) Elevation (고도)
Feature (예측변수) Terrain (지형) Slope (경사도)
Feature (예측변수) Terrain (지형) Curvature (곡률)
Feature (예측변수) Hydrology (수문) Topographic Wetness Index; TWI (지형 습윤 지수)
Feature (예측변수) Hydrology (수문) Distance from River (강으로부터의 거리)
Feature (예측변수) Hydrology (수문) Distance from Sea (바다로부터의 거리)
Feature (예측변수) Cover (피복) Normalized Difference Vegetation Index; NDVI (정규 식생 지수)
Feature (예측변수) Cover (피복) Normalized Difference Built-up Index; NDBI (정규 시가화 지수)
Feature (예측변수) Cover (피복) Soil Drain (토양 배수 등급)
Feature (예측변수) Drainage (배수) Manhole (맨홀 개수)
Feature (예측변수) Drainage (배수) Pump Station (유역면적 대비 배수 펌프장 용량)
Target (결과 변수) Occurrence of Flood (침수 여부)

Source

Load

# study area
area <- st_read("data/shp/Busan.shp")
Reading layer `Busan' from data source 
  `C:\Users\songh\Desktop\Project_2\data\shp\Busan.shp' using driver `ESRI Shapefile'
Simple feature collection with 1 feature and 1 field
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 1115085 ymin: 1655365 xmax: 1163986 ymax: 1711699
Projected CRS: ITRF_2000_UTM_K
# target variable
floodtype <- st_read("data/shp/Flood_region.shp")
Reading layer `Flood_region' from data source 
  `C:\Users\songh\Desktop\Project_2\data\shp\Flood_region.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 167 features and 5 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 1115085 ymin: 1655365 xmax: 1163986 ymax: 1711699
Projected CRS: ITRF_2000_UTM_K
floodtype <- st_transform(floodtype, crs(area))

# Feature variable
## Terrain
### Elevation
#raw_1 <- rast("data/raster/n34_e128_1arc_v3.tif")
#raw_2 <- rast("data/raster/n34_e129_1arc_v3.tif")
#raw_3 <- rast("data/raster/n35_e128_1arc_v3.tif")
#raw_4 <- rast("data/raster/n35_e129_1arc_v3.tif")
#dem <- mosaic(raw_1, raw_2, raw_3, raw_4)
#dem <- project(dem, crs(area))
#writeRaster(dem, "data/raster/DEM.tif", overwrite = TRUE)
dem <- rast("data/raster/DEM.tif")
### Slope
#wbt_slope(dem = "data/raster/DEM_filled_breached.tif",
          #output = "data/raster/Slope.tif",
          #units = "degrees")
slope <- rast("data/raster/Slope.tif")
### Curvature
#curv <- curvature(dem, type = "total")
#writeRaster(curv, "data/raster/Curv.tif", overwrite=TRUE)
curv <- rast("data/raster/Curv.tif")
curv[is.na(curv)] <- 0
## Hydrology
### Topographic Wetness Index
#wbt_breach_depressions_least_cost(
  #dem = "data/raster/DEM.tif",
  #output = "data/raster/DEM_breached.tif",
  #dist = 5,
  #fill = TRUE)
#wbt_fill_depressions_wang_and_liu(
  #dem = "data/raster/DEM_breached.tif",
  #output = "data/raster/DEM_filled_breached.tif")
#wbt_d_inf_flow_accumulation("data/raster/DEM_filled_breached.tif",
                            #"data/raster/DEM_flowaccum.tif")
#wbt_wetness_index(sca = "data/raster/DEM_flowaccum.tif",
                  #slope = "data/raster/Slope.tif",
                  #output = "data/raster/TWI.tif")
twi <- rast("data/raster/TWI.tif")
### Distance from Stream
area_rast <- dem
area_rast[] <- 0
stream <- st_read("data/shp/River_BUSAN.shp")
Reading layer `River_BUSAN' from data source 
  `C:\Users\songh\Desktop\Project_2\data\shp\River_BUSAN.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 354 features and 6 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 1119062 ymin: 1670733 xmax: 1162374 ymax: 1710879
Projected CRS: Korea 2000 / Unified CS
stream <- st_transform(stream, crs(area))
dist_str <- terra::distance(rasterize(stream, area_rast))
### Distance from Coastline
coastline <- st_read("data/shp/Coastline_BUSAN.shp")
Reading layer `Coastline_BUSAN' from data source 
  `C:\Users\songh\Desktop\Project_2\data\shp\Coastline_BUSAN.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 1968 features and 7 fields
Geometry type: MULTILINESTRING
Dimension:     XY
Bounding box:  xmin: 1115085 ymin: 1655365 xmax: 1163986 ymax: 1705327
Projected CRS: Korea 2000 / Unified CS
coastline <- st_transform(coastline, crs(area))
dist_coast <- terra::distance(rasterize(coastline, area_rast))
## Cover
### NDVI
#Nor_B4 <- rast("data/raster/LC08_L2SP_114035_20230407_20230420_02_T1_SR_B4.tif")
#Nor_B5 <- rast("data/raster/LC08_L2SP_114035_20230407_20230420_02_T1_SR_B5.tif")
#Sou_B4 <- rast("data/raster/LC08_L2SP_114036_20230407_20230420_02_T1_SR_B4.tif")
#Sou_B5 <- rast("data/raster/LC08_L2SP_114036_20230407_20230420_02_T1_SR_B5.tif")
#LST_Band4 <- mosaic(Nor_B4, Sou_B4)
#LST_Band5 <- mosaic(Nor_B5, Sou_B5)
#red <- LST_Band4[[1]]
#nir <- LST_Band5[[1]]
#ndviCal <- function(red, nir) {
    #ndviArray <- (nir - red)/(nir + red)
    #return(ndviArray)
    #}
#ndvi <- ndviCal(red,nir)
#ndvi <- project(ndvi, crs(area))
#writeRaster(ndvi,"data/raster/NDVI.tif", overwrite = TRUE)
ndvi <- rast("data/raster/NDVI.tif")
ndvi <- resample(ndvi, dem)
### NDBI
#Nor_B6 <- rast("data/raster/LC08_L2SP_114035_20230407_20230420_02_T1_SR_B6.tif")
#Sou_B6 <- rast("data/raster/LC08_L2SP_114036_20230407_20230420_02_T1_SR_B6.tif")
#LST_Band6 <- mosaic(Nor_B6, Sou_B6)
#nir <- LST_Band5[[1]]
#swir <- LST_Band6[[1]]
#ndbiCal <- function(nir, swir) {
   #ndbiArray <- (swir - nir)/(swir + nir)
   #return(ndbiArray)
   #}
#ndbi <- ndbiCal(nir, swir)
#ndbi <- project(ndbi, crs(area))
#writeRaster(ndbi,"data/raster/NDBI.tif", overwrite = TRUE)
ndbi <- rast("data/raster/NDBI.tif")
ndbi <- resample(ndbi, dem)
## Drainage
### Soil Drain
drain <- st_read("data/shp/Z_SIS_ASIT_SOILDRAIN_AREA_BUSAN.shp")
Reading layer `Z_SIS_ASIT_SOILDRAIN_AREA_BUSAN' from data source 
  `C:\Users\songh\Desktop\Project_2\data\shp\Z_SIS_ASIT_SOILDRAIN_AREA_BUSAN.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 6 features and 4 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 1115085 ymin: 1668368 xmax: 1163986 ymax: 1711699
Projected CRS: ITRF_2000_UTM_K
drain <- st_transform(drain, crs(area))
drain$SOILDRA <- as.numeric(drain$SOILDRA)
### Manhole
area_emd <- st_read("data/shp/Z_SOP_BND_ADM_DONG_PG.shp", options = "ENCODING=CP949")
options:        ENCODING=CP949 
Reading layer `Z_SOP_BND_ADM_DONG_PG' from data source 
  `C:\Users\songh\Desktop\Project_2\data\shp\Z_SOP_BND_ADM_DONG_PG.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 3501 features and 4 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -10045 ymin: -42050.55 xmax: 632508.9 ymax: 568996.4
Projected CRS: ITRF_2000_TM_Korea_Central_Belt
area_emd <- area_emd %>% filter(str_starts(ADM_DR_CD, '21'))
area_emd <- st_transform(area_emd, crs(area))
area_emd$area <- st_area(area_emd)
manhole <- read_csv("data/table/부산광역시_부산도시공간정보시스템_도로상하수도기반시설물_하수맨홀.csv")
colnames(manhole)[7] <- "ADM_DR_NM"
manhole <- manhole %>% group_by(ADM_DR_NM) %>% summarize(count = n())
manhole <- area_emd %>% left_join(manhole)
manhole[is.na(manhole)] <- 0
manhole$count <- as.numeric(manhole$count)
manhole <- manhole %>% mutate("manhole" = count/area)
### Pump
pump <- read_csv("./data/table/부산광역시_배수 펌프장 현황_20230324.csv")[c(1,2,3,8)]
basin <- st_read("./data/shp/Busan_basin.shp")
Reading layer `Busan_basin' from data source 
  `C:\Users\songh\Desktop\Project_2\data\shp\Busan_basin.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 15 features and 7 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 363435.9 ymin: 167327.8 xmax: 409521.4 ymax: 212478.8
Projected CRS: PCS_ITRF2000_TM
basin <- st_transform(basin, crs(area))
clientId <-  "4kb73whdwq"
clientSecret <- "1y4snfRusPuUQmnDUNySoZmubznDyC1BfeuHrtWH"
no_cores <- detectCores(logical = F) - 1  
cl <- makeCluster(no_cores, type="PSOCK")  
registerDoParallel(cl)  
result <- foreach(i=1:length(pump$위치),.combine = dplyr::bind_rows, .packages = c("dplyr","httr","XML")) %dopar% {
  apiResult <- httr::GET( 
    url = "https://naveropenapi.apigw.ntruss.com/map-geocode/v2/geocode", 
    httr::add_headers(
      `X-NCP-APIGW-API-KEY-ID` = clientId, 
      `X-NCP-APIGW-API-KEY` = clientSecret
    ),
    query = list(
      `query` = pump$위치[i]
    )
  )
  if (apiResult$status_code == "200"){
    #print("ResultCode OK!")
    result <- base::rawToChar(apiResult$content)  
    base::Encoding(result) <- "UTF-8"
    list <- XML::xmlToList(result)
    if (base::is.null(x = list$addresses$x)){
      x <- -1e7+1
      y <- -1e7+1
    }
    else {
      x <- base::as.numeric(list$addresses$x)
      y <- base::as.numeric(list$addresses$y)
    }
  }
  else if (apiResult$status_code == "400"){
    base::cat(paste("Bad Request Exception in",pump$배수펌프장명[i]))
  }
  else if (apiResult$status_code == "500"){
    base::cat(paste("Unexpected Error in",pump$배수펌프장명[i]))
  }
  else{
    base::cat("몰라요")
  }
  base::cbind(pump[i,],x,y) |> as_tibble()
}
stopCluster(cl)
result <- result %>% filter(x >= 127)
sf_pump <- st_as_sf(result, coords = c("x", "y"), 
                    crs = 4326, agr = "constant") %>%
  st_transform(st_crs(basin))
joined <- st_join(sf_pump, basin, join = st_intersects)
joined$SBSNNM <- as.character(joined$SBSNNM)
joined$`배수량(제곱미터_분)` <- as.numeric(joined$`배수량(제곱미터_분)`)
joined <- joined %>% rename("배수량" = "배수량(제곱미터_분)")
pump_joind <- joined %>% st_drop_geometry() %>% 
  group_by(SBSNNM) %>% 
  summarise(sum = sum(배수량))
pump_joind <- na.omit(pump_joind)
pump_joind <- basin %>% left_join(pump_joind)
pump_joind[is.na(pump_joind)] <- 0
pump_joind$area <- st_area(pump_joind)
pump <- pump_joind %>% mutate("ratio" = sum/area)

v_dem <- mask(dem, area)
v_slope <- mask(slope, area)
v_curv <- mask(curv, area)
v_twi <- mask(twi, area)
v_dist_str <- mask(dist_str, area)
v_dist_coast <- mask(dist_coast, area)
v_ndvi <- mask(ndvi, area)
v_ndbi <- mask(ndbi, area)
v_dem <- crop(v_dem, ext(c(1110000, 1170000, 1650000, 1720000)))
v_slope <- crop(v_slope, ext(c(1110000, 1170000, 1650000, 1720000)))
v_curv <- crop(v_curv, ext(c(1110000, 1170000, 1650000, 1720000)))
v_twi <- crop(v_twi, ext(c(1110000, 1170000, 1650000, 1720000)))
v_dist_str <- crop(v_dist_str, ext(c(1110000, 1170000, 1650000, 1720000)))
v_dist_coast <- crop(v_dist_coast, ext(c(1110000, 1170000, 1650000, 1720000)))
v_ndvi <- crop(v_ndvi, ext(c(1110000, 1170000, 1650000, 1720000)))
v_ndbi <- crop(v_ndbi, ext(c(1110000, 1170000, 1650000, 1720000)))

2. Variable

Feature

Terrain

tmap_mode("plot")

tm_shape(v_dem) +
  tm_raster(style = "cont", palette = "-YlGn", legend.show = TRUE, title = "elevation")

tm_shape(v_slope) +
  tm_raster(style = "cont", palette = "YlGnBu", legend.show = TRUE, title = "slope")

tm_shape(v_curv) +
  tm_raster(style = "cont", palette = "PuOr", legend.show = TRUE, title = "curvature")

Hydrology

tm_shape(v_twi) +
  tm_raster(style = "cont", palette = "Blues", legend.show = TRUE, title = "twi") 

tm_shape(area) + tm_fill(col = 'darkgrey') +
 tm_shape(stream) + tm_fill(col = "skyblue")

tm_shape(v_dist_str) +
 tm_raster(style = "cont", palette = "-Blues", legend.show = TRUE, title = "dist_str")

tm_shape(area) + tm_fill(col = 'darkgrey') +
 tm_shape(coastline) + tm_lines(col = "skyblue")

tm_shape(v_dist_coast) +
 tm_raster(style = "cont", palette = "-Blues", legend.show = TRUE, title = "dist_coast")

Cover

tm_shape(v_ndvi) +
  tm_raster(style = "cont", palette = "RdYlGn", legend.show = TRUE, title = "ndvi")

tm_shape(v_ndbi) +
  tm_raster(style = "cont", palette = "-PiYG", legend.show = TRUE, title = "ndbi") 

tm_shape(drain) +
 tm_fill(col = "SOILDRA", title = "Drain") +
 tm_legend(outside = TRUE)

Drainage

tm_shape(manhole) +
 tm_polygons('count', palette = "BuPu", title = "manhole") +
 tm_legend(outside = TRUE)

tm_shape(basin) + tm_polygons() +
  tm_shape(sf_pump) + tm_dots(col = "#FC4E07", size = 0.05)

tm_shape(pump) + 
  tm_polygons('ratio', palette = "Reds") +
  tm_layout(legend.outside = TRUE)

Target

tm_shape(floodtype) + tm_fill(col = "type", palette = "RdGy", title = "Flood") +
 tm_legend(outside = TRUE)

3. Summary

Statistics

drain <- rasterize(drain, area_rast, field = "SOILDRA")
manhole <- rasterize(manhole, area_rast, field = "manhole")
pump <- rasterize(pump, area_rast, field = "ratio")
drain <- as.numeric(drain)
manhole <- as.numeric(manhole)
pump <- as.numeric(pump)

df_pred <- c(dem, slope, curv, twi, dist_str, dist_coast, ndvi, ndbi, drain, manhole, pump)
df_dep <- rasterize(floodtype, area_rast, field = "type")
df_dep[is.na(df_dep)] <- 0
df <- c(df_pred, df_dep)
names(df) <- c("dem", "slope", "curv", "twi", "dist_str", "dist_coast", "ndvi", "ndbi", "drain", "manhole", "pump", "type")

df_busan <- mask(df, area)
df_table <- as.data.frame(df_busan)
df_table$type <- as.factor(df_table$type)
df_table <- na.omit(df_table)
summary(df_table)
      dem             slope               curv                 twi        
 Min.   :-20.73   Min.   : 0.00001   Min.   :-3.061e-02   Min.   : 3.026  
 1st Qu.: 10.79   1st Qu.: 2.07346   1st Qu.:-1.187e-03   1st Qu.: 5.616  
 Median : 69.05   Median :10.01638   Median :-4.200e-05   Median : 6.682  
 Mean   :113.20   Mean   :11.23088   Mean   : 1.788e-05   Mean   : 7.666  
 3rd Qu.:171.04   3rd Qu.:18.40729   3rd Qu.: 1.063e-03   3rd Qu.: 8.385  
 Max.   :773.91   Max.   :61.17732   Max.   : 3.260e-02   Max.   :33.700  
    dist_str        dist_coast         ndvi               ndbi          
 Min.   :   0.0   Min.   :    0   Min.   :-0.10488   Min.   :-0.319478  
 1st Qu.: 205.4   1st Qu.: 1643   1st Qu.: 0.09823   1st Qu.:-0.119984  
 Median : 480.5   Median : 4593   Median : 0.19082   Median :-0.053191  
 Mean   : 598.3   Mean   : 5219   Mean   : 0.18116   Mean   :-0.063186  
 3rd Qu.: 846.4   3rd Qu.: 8120   3rd Qu.: 0.26450   3rd Qu.:-0.009993  
 Max.   :4126.3   Max.   :16443   Max.   : 0.42337   Max.   : 0.377114  
     drain          manhole         pump          type       
 Min.   :1.000   Min.   :  0   Min.   :0.000   FALSE:929563  
 1st Qu.:1.000   1st Qu.: 44   1st Qu.:0.000   TRUE : 14226  
 Median :2.000   Median :131   Median :3.000                 
 Mean   :2.973   Mean   :111   Mean   :3.237                 
 3rd Qu.:5.000   3rd Qu.:156   3rd Qu.:6.000                 
 Max.   :6.000   Max.   :200   Max.   :8.000                 

Histogram

ggplot(data = df_table, mapping = aes(x = dem, fill = type, color = type)) +
 geom_density(alpha = 0.5, position = "identity")

ggplot(data = df_table, mapping = aes(x = slope, fill = type, color = type)) +
 geom_density(alpha = 0.5, position = "identity")

ggplot(data = df_table, mapping = aes(x = curv, fill = type, color = type)) +
 geom_density(alpha = 0.5, position = "identity")

ggplot(data = df_table, mapping = aes(x = twi, fill = type, color = type)) +
 geom_density(alpha = 0.5, position = "identity")

ggplot(data = df_table, mapping = aes(x = dist_str, fill = type, color = type)) +
 geom_density(alpha = 0.5, position = "identity")

ggplot(data = df_table, mapping = aes(x = dist_coast, fill = type, color = type)) +
 geom_density(alpha = 0.5, position = "identity")

ggplot(data = df_table, mapping = aes(x = ndvi, fill = type, color = type)) +
 geom_density(alpha = 0.5, position = "identity")

ggplot(data = df_table, mapping = aes(x = ndbi, fill = type, color = type)) +
 geom_density(alpha = 0.5, position = "identity")

Scatterplot

df_cor <- df_table[c(-12)]
df_cor <- cor(df_cor)
corrplot(df_cor, type="full", order="hclust", tl.col="black", tl.srt=45)

4. Point Sampling

flood <- st_read("data/shp/Flood_area.shp")
Reading layer `Flood_area' from data source 
  `C:\Users\songh\Desktop\Project_2\data\shp\Flood_area.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 166 features and 16 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 128.8218 ymin: 35.04778 xmax: 129.2761 ymax: 35.37124
Geodetic CRS:  WGS 84
flood <- st_transform(flood, crs(area))
nonflood <- st_read("data/shp/Nonflood_area.shp")
Reading layer `Nonflood_area' from data source 
  `C:\Users\songh\Desktop\Project_2\data\shp\Nonflood_area.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 1 feature and 4 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 1115085 ymin: 1655365 xmax: 1163986 ymax: 1711699
Projected CRS: ITRF_2000_UTM_K
nonflood <- st_transform(nonflood, crs(area))

rp_T <- st_sample(flood, 250)
rp_T_att = data.frame(type = "TRUE")
rp_T = st_sf(rp_T_att, geometry = rp_T)
rp_F <- st_sample(nonflood, 250)
rp_F_att = data.frame(type = "FALSE")
rp_F = st_sf(rp_F_att, geometry = rp_F)
ran_points <- rbind(rp_T, rp_F)

df_points <- terra::extract(df, ran_points)
df_points <- cbind(ran_points[c(-1)], df_points[c(-1)])
df_points <- na.omit(df_points)
st_geometry(df_points) <- "geometry"

tm_shape(area) + tm_fill(col = 'darkgrey') +
 tm_shape(df_points) + tm_dots(col = "type", size = 0.1, palette = "RdBu")

5. Random Forest Modeling

SpatialRF

Define Spatial Neighborhood

# names of the response variable and the predictors
df_points_rf <- df_points %>% st_drop_geometry()
df_points$type <- as.numeric(df_points$type)
df_points_gwrf <- df_points %>% st_drop_geometry()
dependent_variable_name <- "type"
predictor_variable_names <-  c("dem", "slope", "curv", "twi", "dist_str", "dist_coast", "ndvi", "ndbi", "drain", "manhole", "pump")

# coordinates of the cases
pcoord <- df_points %>%
    mutate(lon = st_coordinates(.)[, 1], lat = st_coordinates(.)[, 2]) %>%
    dplyr::select(lon, lat)
pcoord <- pcoord %>%
    st_drop_geometry()
colnames(pcoord) <- c("x", "y")

# distance matrix
distance_matrix <- st_distance(df_points)
distance_matrix <- as.matrix(distance_matrix)
units(distance_matrix) <- NULL

# distance thresholds (same units as distance_matrix)
distance_thresholds <- c(0, 1000, 2000, 4000, 8000, 16000, 32000)

# random seed for reproducibility
random_seed <- 1
spatialRF::plot_training_df(
  data = df_points_gwrf,
  dependent.variable.name = dependent_variable_name,
  predictor.variable.names = predictor_variable_names,
  ncol = 3,
  point.color = viridis::viridis(100, option = "F"),
  line.color = "gray30"
  )

Fitting

model.non.spatial <- spatialRF::rf(
  data = df_points_gwrf,
  dependent.variable.name = dependent_variable_name,
  predictor.variable.names = predictor_variable_names,
  distance.matrix = distance_matrix,
  distance.thresholds = distance_thresholds,
  xy = pcoord, #not needed by rf, but other functions read it from the model
  seed = random_seed,
  verbose = FALSE
)

Feature Importance

spatialRF::plot_importance(
  model.non.spatial,
  verbose = FALSE
  )

model.non.spatial <- spatialRF::rf_importance(
  model = model.non.spatial
  )

Local Importance of Feature

local.importance <- spatialRF::get_importance_local(model.non.spatial)
local.importance <- cbind(pcoord, local.importance)

color.low <- viridis::viridis(1)
color.high <- viridis::viridis(5)

p1 <- ggplot2::ggplot() +
  ggplot2::geom_sf(
    data = area,
    fill = "white") +
  ggplot2::geom_point(
    data = local.importance,
    ggplot2::aes(
      x = x,
      y = y,
      color = dem)) +
 ggplot2::scale_color_gradient2(
    low = color.low, 
    high = color.high) +
  ggplot2::theme_bw() +
  ggplot2::theme(legend.position = "bottom") + 
  ggplot2::ggtitle("Local Importance of Elevation") +
  ggplot2::theme(
    plot.title = ggplot2::element_text(hjust = 0.5),
    legend.key.width = ggplot2::unit(1,"cm")) + 
  ggplot2::labs(color = "Importance") + 
  ggplot2::xlab("Longitude") + 
  ggplot2::ylab("Latitude")

p2 <- ggplot2::ggplot() +
  ggplot2::geom_sf(
    data = area,
    fill = "white") +
  ggplot2::geom_point(
    data = local.importance,
    ggplot2::aes(
      x = x,
      y = y,
      color = slope)) +
 ggplot2::scale_color_gradient2(
    low = color.low, 
    high = color.high) +
  ggplot2::theme_bw() +
  ggplot2::theme(legend.position = "bottom") + 
  ggplot2::ggtitle("Local Importance of Slope") +
  ggplot2::theme(
    plot.title = ggplot2::element_text(hjust = 0.5),
    legend.key.width = ggplot2::unit(1,"cm")) + 
  ggplot2::labs(color = "Importance") + 
  ggplot2::xlab("Longitude") + 
  ggplot2::ylab("Latitude")

p3 <- ggplot2::ggplot() +
  ggplot2::geom_sf(
    data = area,
    fill = "white") +
  ggplot2::geom_point(
    data = local.importance,
    ggplot2::aes(
      x = x,
      y = y,
      color = curv)) +
 ggplot2::scale_color_gradient2(
    low = color.low, 
    high = color.high) +
  ggplot2::theme_bw() +
  ggplot2::theme(legend.position = "bottom") + 
  ggplot2::ggtitle("Local Importance of Curvature") +
  ggplot2::theme(
    plot.title = ggplot2::element_text(hjust = 0.5),
    legend.key.width = ggplot2::unit(1,"cm")) + 
  ggplot2::labs(color = "Importance") + 
  ggplot2::xlab("Longitude") + 
  ggplot2::ylab("Latitude")

p4 <- ggplot2::ggplot() +
  ggplot2::geom_sf(
    data = area,
    fill = "white") +
  ggplot2::geom_point(
    data = local.importance,
    ggplot2::aes(
      x = x,
      y = y,
      color = twi)) +
 ggplot2::scale_color_gradient2(
    low = color.low, 
    high = color.high) +
  ggplot2::theme_bw() +
  ggplot2::theme(legend.position = "bottom") + 
  ggplot2::ggtitle("Local Importance of TWI") +
  ggplot2::theme(
    plot.title = ggplot2::element_text(hjust = 0.5),
    legend.key.width = ggplot2::unit(1,"cm")) + 
  ggplot2::labs(color = "Importance") + 
  ggplot2::xlab("Longitude") + 
  ggplot2::ylab("Latitude")

p5 <- ggplot2::ggplot() +
  ggplot2::geom_sf(
    data = area,
    fill = "white") +
  ggplot2::geom_point(
    data = local.importance,
    ggplot2::aes(
      x = x,
      y = y,
      color = dist_str)) +
 ggplot2::scale_color_gradient2(
    low = color.low, 
    high = color.high) +
  ggplot2::theme_bw() +
  ggplot2::theme(legend.position = "bottom") + 
  ggplot2::ggtitle("Local Importance of Distance from Stream") +
  ggplot2::theme(
    plot.title = ggplot2::element_text(hjust = 0.5),
    legend.key.width = ggplot2::unit(1,"cm")) + 
  ggplot2::labs(color = "Importance") + 
  ggplot2::xlab("Longitude") + 
  ggplot2::ylab("Latitude")

p6 <- ggplot2::ggplot() +
  ggplot2::geom_sf(
    data = area,
    fill = "white") +
  ggplot2::geom_point(
    data = local.importance,
    ggplot2::aes(
      x = x,
      y = y,
      color = dist_coast)) +
 ggplot2::scale_color_gradient2(
    low = color.low, 
    high = color.high) +
  ggplot2::theme_bw() +
  ggplot2::theme(legend.position = "bottom") + 
  ggplot2::ggtitle("Local Importance of Distance from Coast") +
  ggplot2::theme(
    plot.title = ggplot2::element_text(hjust = 0.5),
    legend.key.width = ggplot2::unit(1,"cm")) + 
  ggplot2::labs(color = "Importance") + 
  ggplot2::xlab("Longitude") + 
  ggplot2::ylab("Latitude")

p7 <- ggplot2::ggplot() +
  ggplot2::geom_sf(
    data = area,
    fill = "white") +
  ggplot2::geom_point(
    data = local.importance,
    ggplot2::aes(
      x = x,
      y = y,
      color = ndvi)) +
 ggplot2::scale_color_gradient2(
    low = color.low, 
    high = color.high) +
  ggplot2::theme_bw() +
  ggplot2::theme(legend.position = "bottom") + 
  ggplot2::ggtitle("Local Importance of NDVI") +
  ggplot2::theme(
    plot.title = ggplot2::element_text(hjust = 0.5),
    legend.key.width = ggplot2::unit(1,"cm")) + 
  ggplot2::labs(color = "Importance") + 
  ggplot2::xlab("Longitude") + 
  ggplot2::ylab("Latitude")

p8 <- ggplot2::ggplot() +
  ggplot2::geom_sf(
    data = area,
    fill = "white") +
  ggplot2::geom_point(
    data = local.importance,
    ggplot2::aes(
      x = x,
      y = y,
      color = ndbi)) +
 ggplot2::scale_color_gradient2(
    low = color.low, 
    high = color.high) +
  ggplot2::theme_bw() +
  ggplot2::theme(legend.position = "bottom") + 
  ggplot2::ggtitle("Local Importance of NDBI") +
  ggplot2::theme(
    plot.title = ggplot2::element_text(hjust = 0.5),
    legend.key.width = ggplot2::unit(1,"cm")) + 
  ggplot2::labs(color = "Importance") + 
  ggplot2::xlab("Longitude") + 
  ggplot2::ylab("Latitude")

p9 <- ggplot2::ggplot() +
  ggplot2::geom_sf(
    data = area,
    fill = "white") +
  ggplot2::geom_point(
    data = local.importance,
    ggplot2::aes(
      x = x,
      y = y,
      color = drain)) +
 ggplot2::scale_color_gradient2(
    low = color.low, 
    high = color.high) +
  ggplot2::theme_bw() +
  ggplot2::theme(legend.position = "bottom") + 
  ggplot2::ggtitle("Local Importance of Drain") +
  ggplot2::theme(
    plot.title = ggplot2::element_text(hjust = 0.5),
    legend.key.width = ggplot2::unit(1,"cm")) + 
  ggplot2::labs(color = "Importance") + 
  ggplot2::xlab("Longitude") + 
  ggplot2::ylab("Latitude")

p10 <- ggplot2::ggplot() +
  ggplot2::geom_sf(
    data = area,
    fill = "white") +
  ggplot2::geom_point(
    data = local.importance,
    ggplot2::aes(
      x = x,
      y = y,
      color = manhole)) +
 ggplot2::scale_color_gradient2(
    low = color.low, 
    high = color.high) +
  ggplot2::theme_bw() +
  ggplot2::theme(legend.position = "bottom") + 
  ggplot2::ggtitle("Local Importance of Manhole") +
  ggplot2::theme(
    plot.title = ggplot2::element_text(hjust = 0.5),
    legend.key.width = ggplot2::unit(1,"cm")) + 
  ggplot2::labs(color = "Importance") + 
  ggplot2::xlab("Longitude") + 
  ggplot2::ylab("Latitude")


p1 | p2 

p3 | p4

p5 | p6

p7 | p8

p9 | p10

Response Curve

spatialRF::plot_response_curves(
  model.non.spatial,
  quantiles = c(0.1, 0.5, 0.9),
  line.color = viridis::viridis(3,
    option = "F", 
    end = 0.9),
  ncol = 3,
  show.data = TRUE)

Partial Dependence Plot

spatialRF::plot_response_surface(
  model.non.spatial,
  a = "dem",
  b = "twi")

pdp::partial(
  model.non.spatial, 
  train = df_points_gwrf, 
  pred.var = c("dem", "twi"), 
  plot = TRUE)

Cross Validation

model.non.spatial <- spatialRF::rf_evaluate(
  model = model.non.spatial,
  xy = pcoord,             
  repetitions = 30,         
  training.fraction = 0.75,
  metrics = "r.squared",
  seed = random_seed,
  verbose = FALSE
)

pr <- pcoord[, c("x", "y")]
pr$group.2 <- pr$group.1 <- "Training"
pr[model.non.spatial$evaluation$spatial.folds[[1]]$testing, "group.1"] <- "Testing"
pr[model.non.spatial$evaluation$spatial.folds[[25]]$testing, "group.2"] <- "Testing"

p1 <- ggplot2::ggplot() +
  ggplot2::geom_sf(data = area, fill = "white") +
  ggplot2::geom_point(data = pr,
          ggplot2::aes(
            x = x,
            y = y,
            color = group.1
            ),
          size = 2
          ) +
  ggplot2::scale_color_viridis_d(
    direction = -1, 
    end = 0.5, 
    alpha = 0.8, 
    option = "F"
    ) +
  ggplot2::theme_bw() +
  ggplot2::labs(color = "Group") +
  ggplot2::ggtitle("Spatial fold 1") + 
  ggplot2::theme(
    legend.position = "none", 
    plot.title = ggplot2::element_text(hjust = 0.5)
  ) + 
  ggplot2::xlab("Longitude") + 
  ggplot2::ylab("Latitude")

p2 <- ggplot2::ggplot() +
  ggplot2::geom_sf(data = area, fill = "white") +
  ggplot2::geom_point(data = pr,
          ggplot2::aes(
            x = x,
            y = y,
            color = group.2
            ),
          size = 2
          ) +
  ggplot2::scale_color_viridis_d(
    direction = -1, 
    end = 0.5, 
    alpha = 0.8, 
    option = "F"
    ) +
  ggplot2::theme_bw() +
  ggplot2::labs(color = "Group") +
  ggplot2::theme(
    plot.title = ggplot2::element_text(hjust = 0.5)
  ) + 
  ggplot2::ggtitle("Spatial fold 25") + 
  ggplot2::xlab("Longitude") + 
  ggplot2::ylab("")

p1 | p2

Tidymodel

Data Split

df_points$type <- as.factor(df_points$type)
df_points$manhole <- as.numeric(df_points$manhole)

df_split <- rsample::initial_split(df_points_rf, prop = 7/10)
train <- rsample::training(df_split)
df_train <- train %>% st_drop_geometry()
test <- rsample::testing(df_split)
df_test <- test %>% st_drop_geometry()

Hyperparameter Tuning

rf_mod <- rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
    set_engine("randomForest", importance = TRUE) %>%
    set_mode("classification")

rf_recipe <- recipe(type ~ ., data = df_train)

rf_workflow <- workflow() %>%
    add_model(rf_mod) %>%
    add_recipe(rf_recipe)

val_set <- validation_split(df_train)

rf_res <- rf_workflow %>%
    tune_grid(val_set,
              grid = 20,
              control = control_grid(save_pred = T),
              metrics = metric_set(roc_auc))

rf_res %>%
    show_best(metric = "roc_auc")
# A tibble: 5 × 8
   mtry min_n .metric .estimator  mean     n std_err .config              
  <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1     6    10 roc_auc binary     0.908     1      NA Preprocessor1_Model09
2     3     4 roc_auc binary     0.908     1      NA Preprocessor1_Model01
3     5     8 roc_auc binary     0.908     1      NA Preprocessor1_Model12
4     8    13 roc_auc binary     0.907     1      NA Preprocessor1_Model18
5     7     7 roc_auc binary     0.907     1      NA Preprocessor1_Model14
autoplot(rf_res)

rf_best <- rf_res %>%
    select_best(metric = "roc_auc")

Fitting

last_rf_mod <- finalize_model(rf_mod, rf_best)

last_rf_workflow <- 
  rf_workflow %>% 
  update_model(last_rf_mod)

last_rf_fit <- 
  last_rf_workflow %>% 
  last_fit(df_split)

Feature Importance

fi <- last_rf_fit %>% 
 pluck(".workflow", 1) %>% 
 pull_workflow_fit() %>% 
 vip()
fi <- fi$data

ggplot(data = fi, mapping = aes(x = Importance, y = reorder(Variable, Importance))) +
 geom_segment(aes(xend = 0, yend = Variable), color = "grey") +
 geom_point(aes(color = Importance, size = 5), show.legend = FALSE) +
 scale_color_gradient2(midpoint = mean(fi$Importance), low = "#FDE725FF", mid = "#1F968BFF", high = "#440154FF") +
 theme_bw() +
 theme(axis.title.y = element_blank())

Performance

last_rf_fit %>% 
  collect_metrics()
# A tibble: 2 × 4
  .metric  .estimator .estimate .config             
  <chr>    <chr>          <dbl> <chr>               
1 accuracy binary         0.884 Preprocessor1_Model1
2 roc_auc  binary         0.937 Preprocessor1_Model1
rf_roc <- last_rf_fit %>% 
 collect_predictions() %>% 
 roc_curve(type, .pred_FALSE) %>%
 mutate(model = 'Random Forest')

last_rf_fit %>% 
  collect_predictions() %>% 
  roc_curve(type, .pred_FALSE) %>% 
  autoplot()

6. Spatial Effect

spatialRF::plot_training_df_moran(
  data = df_points_gwrf,
  dependent.variable.name = dependent_variable_name,
  predictor.variable.names = predictor_variable_names,
  distance.matrix = distance_matrix,
  distance.thresholds = distance_thresholds,
  fill.color = viridis::viridis(100, option = "F", direction = -1),
  point.color = "gray40"
)

spatialRF::plot_residuals_diagnostics(
  model.non.spatial,
  verbose = FALSE
  )

spatialRF::plot_moran(
  model.non.spatial, 
  verbose = FALSE)

7. Spatial Random Forest Modeling

Spatial RF

Fitting

model.spatial <- spatialRF::rf_spatial(
  model = model.non.spatial,
  method = "mem.moran.sequential", #default method
  verbose = FALSE,
  seed = random_seed
  )

spatialRF::plot_moran(
  model.spatial, 
  verbose = FALSE
  )

Feature Importance

p1 <- spatialRF::plot_importance(
  model.non.spatial,
  fill.color = c("#FDE725FF", "#1F968BFF", "#440154FF"),
  line.color = "black",
  verbose = FALSE) + 
  ggplot2::ggtitle("Non-spatial model") 

p2 <- spatialRF::plot_importance(
  model.spatial,
  fill.color = c("#FDE725FF", "#1F968BFF", "#440154FF"),
  line.color = "black",
  verbose = FALSE) + 
  ggplot2::ggtitle("Spatial model")

p1 | p2 

model.spatial.repeat <- spatialRF::rf_repeat(
  model = model.spatial, 
  repetitions = 30,
  seed = random_seed,
  verbose = FALSE
)
spatialRF::plot_importance(
  model.spatial.repeat,
  fill.color = c("#FDE725FF", "#1F968BFF", "#440154FF"),
  line.color = "black",
  verbose = FALSE
  )

Response Curve

spatialRF::plot_response_curves(
  model.spatial.repeat, 
  quantiles = 0.5,
  ncol = 3
  )

Make Spatial Predictors

spatial.predictors <- spatialRF::get_spatial_predictors(model.spatial)
pr <- data.frame(spatial.predictors, pcoord[, c("x", "y")])

Optimization

p_opt <- spatialRF::plot_optimization(model.spatial)

Tidymodel

Make Spatial Predictor

mems <- spatialRF::mem_multithreshold(
  distance.matrix = distance_matrix,
  distance.thresholds = distance_thresholds
)

mem.rank <- spatialRF::rank_spatial_predictors(
  distance.matrix = distance_matrix,
  spatial.predictors.df = mems,
  ranking.method = "moran"
)


model.formula <- as.formula(
  paste(
    dependent_variable_name,
    " ~ ",
    paste(
      predictor_variable_names,
      collapse = " + "
    )
  )
)

#scaling the data
model.data <- scale(df_points_gwrf) %>% 
  as.data.frame()

#fitting the model
m <- lm(model.formula, data = model.data)

#Moran's I test of the residuals
moran.test <- spatialRF::moran(
  x = residuals(m),
  distance.matrix = distance_matrix,
  verbose = FALSE
)
moran.test$plot

#add mems to the data and applies scale()
model.data <- data.frame(
  df_points_gwrf,
  mems
) %>%
  scale() %>%
  as.data.frame()

#initialize predictors.i
predictors.i <- predictor_variable_names

#iterating through MEMs
for(mem.i in colnames(mems)){
  
  #add mem name to model definintion
  predictors.i <- c(predictors.i, mem.i)
  
  #generate model formula with the new spatial predictor
  model.formula.i <- as.formula(
    paste(
      dependent_variable_name,
      " ~ ",
      paste(
        predictors.i,
        collapse = " + "
      )
    )
  )
  
  #fit model
  m.i <- lm(model.formula.i, data = model.data)
  
  #Moran's I test
  moran.test.i <- moran(
    x = residuals(m.i),
    distance.matrix = distance_matrix,
    verbose = FALSE
  )
  
  #stop if no autocorrelation
  if(moran.test.i$test$interpretation == "No spatial correlation"){
    break
  }
  
}#end of loop

#last moran test
moran.test.i$plot

Data Split

rank_mems <- mems[, mem.rank$ranking]
mems <- mems[1:length(predictors.i)]
df_points_gwrf <- cbind(df_points_rf, mems)

df_split <- rsample::initial_split(df_points_gwrf, prop = 7/10)
train <- rsample::training(df_split)
df_train <- train %>% st_drop_geometry()
test <- rsample::testing(df_split)
df_test <- test %>% st_drop_geometry()

Hyperparameter Tuning

all_cores <- parallel::detectCores(logical = FALSE)
cl <- makePSOCKcluster(all_cores)
registerDoParallel(cl)

gwrf_mod <- rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
    set_engine("randomForest", importance = TRUE) %>%
    set_mode("classification")

gwrf_recipe <- recipe(type ~ ., data = df_train)

gwrf_workflow <- workflow() %>%
    add_model(gwrf_mod) %>%
    add_recipe(gwrf_recipe)

gwrf_val_set <- validation_split(df_train)

gwrf_res <- gwrf_workflow %>%
    tune_grid(gwrf_val_set,
              grid = 20,
              control = control_grid(save_pred = T),
              metrics = metric_set(roc_auc))

gwrf_res %>%
    show_best(metric = "roc_auc")
# A tibble: 5 × 8
   mtry min_n .metric .estimator  mean     n std_err .config              
  <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1    29    27 roc_auc binary     0.955     1      NA Preprocessor1_Model17
2    16    26 roc_auc binary     0.953     1      NA Preprocessor1_Model02
3    32     7 roc_auc binary     0.953     1      NA Preprocessor1_Model09
4    33    17 roc_auc binary     0.953     1      NA Preprocessor1_Model20
5    43    11 roc_auc binary     0.953     1      NA Preprocessor1_Model15
autoplot(gwrf_res)

gwrf_best <- gwrf_res %>%
    select_best(metric = "roc_auc")

Fitting

last_gwrf_mod <- finalize_model(gwrf_mod, gwrf_best)

last_gwrf_workflow <- 
  gwrf_workflow %>% 
  update_model(last_gwrf_mod)

last_gwrf_fit <- 
  last_gwrf_workflow %>% 
  last_fit(df_split)

Feature Importance

fi <- last_gwrf_fit %>% 
 pluck(".workflow", 1) %>% 
 pull_workflow_fit() %>% 
 vip()
fi <- fi$data

ggplot(data = fi, mapping = aes(x = Importance, y = reorder(Variable, Importance))) +
 geom_segment(aes(xend = 0, yend = Variable), color = "grey") +
 geom_point(aes(color = Importance, size = 5), show.legend = FALSE) +
 scale_color_gradient2(midpoint = mean(fi$Importance), low = "#FDE725FF", mid = "#1F968BFF", high = "#440154FF") +
 theme_bw() +
 theme(axis.title.y = element_blank())

Performance

last_gwrf_fit %>% 
  collect_metrics()
# A tibble: 2 × 4
  .metric  .estimator .estimate .config             
  <chr>    <chr>          <dbl> <chr>               
1 accuracy binary         0.864 Preprocessor1_Model1
2 roc_auc  binary         0.936 Preprocessor1_Model1
gwrf_roc <- last_gwrf_fit %>% 
 collect_predictions() %>% 
 roc_curve(type, .pred_FALSE) %>%
 mutate(model = 'Random Forest')

last_gwrf_fit %>% 
  collect_predictions() %>% 
  roc_curve(type, .pred_FALSE) %>% 
  autoplot()

8. Model Comparison

ROC Curve

gwrf_roc$model <- "Spatial Random Forest"
  
bind_rows(rf_roc, gwrf_roc) %>% 
  ggplot(aes(x = 1 - specificity, y = sensitivity, col = model))+
  geom_path(lwd = 1.5, alpha = 0.8) +
  geom_abline(lty = 3) +
  scale_color_manual(values = c("darkgray", "#0B2171")) +
  coord_equal() + 
  theme_bw() + 
  theme(legend.position = "bottom")

Mapping Vulnerable Area

dem[is.na(dem)] <- 0
slope[is.na(slope)] <- 0
twi[is.na(twi)] <- 0
dist_str[is.na(dist_str)] <- 0
dist_coast[is.na(dist_coast)] <- 0
ndvi[is.na(ndvi)] <- 0
ndbi[is.na(ndbi)] <- 0
drain[is.na(drain)] <- 0
manhole[is.na(manhole)] <- 0
pump[is.na(pump)] <- 0

df_pred <- c(dem, slope, curv, twi, dist_str, dist_coast, ndvi, ndbi, drain, manhole, pump)
names(df_pred) <- c("dem", "slope", "curv", "twi", "dist_str", "dist_coast", "ndvi", "ndbi", "drain", "manhole", "pump")

df_pred <- crop(df_pred, ext(c(1110000, 1170000, 1650000, 1720000)))

last_rf_fit %>% extract_fit_parsnip() %>%
  terra::predict(df_pred) -> r_pred

vect_pred <- r_pred %>% unlist() %>% as.numeric() %>% -1 %>% as.vector() %>%
  matrix(nrow = dim(df_pred)[1], dim(df_pred)[2], byrow = T) %>%
  rast(crs = terra::crs(df_pred),
       extent = terra::ext(df_pred))
vect_pred[vect_pred == 0] <- NA
vect_pred <- mask(vect_pred, area)

tmap_mode("view")
tm_shape(area) + tm_borders() +
tm_shape(vect_pred) +
  tm_raster(legend.show = TRUE, palette = "Blues", title = "Vulnerable Area") +
  tm_scale_bar()